# Module 3 Exercise — Joins and Summaries
# Based on R4DS Chapters 12 (Tidy Data) and 13 (Relational Data)
# Practice reshaping and joining clinical data

# ===========================
# SETUP: Load Required Packages
# ===========================

library(tidyverse)  # Includes dplyr, tidyr for joins and pivots
library(lubridate)  # Date manipulation

# ===========================
# EXERCISE 1: Tidy Data Practice (R4DS Ch. 12)
# ===========================

# Create a wide format lab dataset (NOT TIDY): one row per subject, with
# every parameter/visit combination stored in its own <PARAM>_<VISIT> column
lab_wide <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004"),
  ALT_BL = c(25, 30, 28, 35),
  AST_BL = c(22, 28, 26, 32),
  BILI_BL = c(0.8, 1.2, 0.9, 1.1),
  ALT_W2 = c(27, 32, 30, 38),
  AST_W2 = c(24, 30, 28, 35),
  BILI_W2 = c(0.9, 1.1, 1.0, 1.2),
  ALT_W4 = c(26, 29, 31, 36),
  AST_W4 = c(23, 27, 29, 33),
  BILI_W4 = c(0.8, 1.0, 1.1, 1.0)
)

cat("Lab data in wide format (NOT TIDY):\n")
print(lab_wide)

# 1a. Convert to long format (TIDY) using pivot_longer()
# Every column except USUBJID is named <PARAM>_<VISIT>, so splitting the
# column names on "_" gives one row per subject, parameter and visit.
lab_long <- lab_wide %>%
  pivot_longer(
    cols = -USUBJID,
    names_to = c("PARAM", "VISIT"),
    names_sep = "_",
    values_to = "AVAL"
  )

cat("\nLab data in long format (TIDY):\n")
print(lab_long)

# 1b. Create a summary table showing mean values by parameter and visit
# na.rm = TRUE keeps a single missing result from turning a whole cell into NA.
lab_summary <- lab_long %>%
  group_by(PARAM, VISIT) %>%
  summarise(mean_value = round(mean(AVAL, na.rm = TRUE), 2), .groups = "drop")

cat("\nLab summary:\n")
print(lab_summary)

# ===========================
# EXERCISE 2: Set Up Clinical Domains (R4DS Ch. 13)
# ===========================

# Build the three sample clinical domains used by the remaining exercises.
# Rows are written one record per line so each subject/event reads across.
# Note: AE contains subject 001-007, who has no DM record, while DM subjects
# 001-003, 001-005 and 001-006 have no AE records.

# Demographics: one row per subject; ELDERLY flags subjects aged 65 or older
dm <- tribble(
  ~USUBJID,  ~AGE, ~SEX, ~ARMCD, ~ARM,        ~RFSTDTC,
  "001-001", 45,   "M",  "TRT",  "Treatment", "2024-01-15",
  "001-002", 67,   "F",  "PBO",  "Placebo",   "2024-01-16",
  "001-003", 52,   "M",  "TRT",  "Treatment", "2024-01-17",
  "001-004", 71,   "F",  "TRT",  "Treatment", "2024-01-18",
  "001-005", 34,   "M",  "PBO",  "Placebo",   "2024-01-19",
  "001-006", 58,   "F",  "TRT",  "Treatment", "2024-01-20"
) %>%
  mutate(ELDERLY = if_else(AGE >= 65, "Y", "N"))

# Adverse events: one row per event, sequenced within subject by AESEQ
ae <- tribble(
  ~USUBJID,  ~AESEQ, ~AEDECOD,    ~AESEV,     ~AESTDTC,
  "001-001", 1,      "HEADACHE",  "MILD",     "2024-01-20",
  "001-001", 2,      "NAUSEA",    "MODERATE", "2024-01-22",
  "001-002", 1,      "FATIGUE",   "MILD",     "2024-01-21",
  "001-004", 1,      "DIZZINESS", "SEVERE",   "2024-01-25",
  "001-007", 1,      "HEADACHE",  "MILD",     "2024-02-01",
  "001-007", 2,      "VOMITING",  "MODERATE", "2024-02-03"
)

# Vital signs: systolic and diastolic blood pressure at baseline
vs <- tribble(
  ~USUBJID,  ~VSTESTCD, ~VSSTRESN, ~VISIT,
  "001-001", "SYSBP",   120,       "Baseline",
  "001-001", "DIABP",   80,        "Baseline",
  "001-002", "SYSBP",   135,       "Baseline",
  "001-002", "DIABP",   85,        "Baseline",
  "001-003", "SYSBP",   115,       "Baseline",
  "001-003", "DIABP",   75,        "Baseline",
  "001-004", "SYSBP",   140,       "Baseline",
  "001-004", "DIABP",   90,        "Baseline"
)

# Show each domain under its own heading
cat("Demographics (DM):\n")
print(dm)

cat("\nAdverse Events (AE):\n")
print(ae)

cat("\nVital Signs (VS):\n")
print(vs)

# ===========================
# EXERCISE 3: Basic Join Operations (R4DS Ch. 13)
# ===========================

# 3a. LEFT JOIN: Add demographics to adverse events
# Every AE row is kept; AEs whose subject has no DM record get NA in the
# demographic columns.
ae_with_demo <- ae %>%
  left_join(dm, by = "USUBJID")

cat("\nAE with demographics (left join):\n")
print(ae_with_demo)

# 3b. INNER JOIN: Keep only AEs whose subject has a record in DM
ae_complete <- ae %>%
  inner_join(dm, by = "USUBJID")

cat("\nComplete AE data (inner join):\n")
print(ae_complete)

# 3c. Count how many subjects are in each dataset
cat("\nSubject counts:\n")
cat("DM subjects:", n_distinct(dm$USUBJID), "\n")
cat("AE subjects:", n_distinct(ae$USUBJID), "\n")
cat("AE with demo (left join):", n_distinct(ae_with_demo$USUBJID), "\n")
cat("AE complete (inner join):", n_distinct(ae_complete$USUBJID), "\n")

# ===========================
# EXERCISE 4: Filtering Joins (R4DS Ch. 13)
# ===========================

# 4a. SEMI JOIN: Find subjects who had any adverse events
# Filtering joins only keep or drop DM rows; no AE columns are added and a
# subject with several AEs still appears once.
subjects_with_ae <- dm %>%
  semi_join(ae, by = "USUBJID")

cat("\nSubjects with AEs (semi join):\n")
print(subjects_with_ae)

# 4b. ANTI JOIN: Find subjects who had NO adverse events
subjects_without_ae <- dm %>%
  anti_join(ae, by = "USUBJID")

cat("\nSubjects without AEs (anti join):\n")
print(subjects_without_ae)

# ===========================
# EXERCISE 5: Group By and Summarise Practice
# ===========================

# 5a. AE summary by treatment group
ae_by_treatment <- ae_with_demo %>%
  filter(!is.na(ARMCD)) %>%  # Exclude subjects not in DM
  group_by(ARMCD) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    # AGE repeats on every AE row of a subject, so average over the first row
    # per subject; otherwise subjects with more events would weigh more.
    mean_age = mean(AGE[!duplicated(USUBJID)]),
    .groups = "drop"
  )

cat("\nAE summary by treatment:\n")
print(ae_by_treatment)

# 5b. AE summary by elderly status and severity
ae_by_elderly_severity <- ae_with_demo %>%
  filter(!is.na(ELDERLY)) %>%  # Exclude subjects not in DM
  group_by(ELDERLY, AESEV) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    .groups = "drop"
  )

cat("\nAE summary by elderly status and severity:\n")
print(ae_by_elderly_severity)

# ===========================
# EXERCISE 6: Complex Analysis Pipeline
# ===========================

# Create a comprehensive analysis that:
# 1. Joins AE with DM
# 2. Calculates study day from AE start date
# 3. Creates severity flag for severe AEs
# 4. Summarises by treatment and elderly status

clinical_summary <- ae %>%
  left_join(dm, by = "USUBJID") %>%
  # AEs without a DM record have no arm or reference date to analyse against
  filter(!is.na(ARMCD)) %>%
  mutate(
    AESTDT = ymd(AESTDTC),
    RFSTDT = ymd(RFSTDTC),
    days_from_ref = as.integer(AESTDT - RFSTDT),
    # Study day has no day 0: the reference date itself is day 1, and dates
    # before it keep their negative offset.
    AESTDY = if_else(days_from_ref >= 0L, days_from_ref + 1L, days_from_ref),
    SEVERE_FL = if_else(AESEV == "SEVERE", "Y", "N")
  ) %>%
  group_by(ARMCD, ELDERLY) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    n_severe = sum(SEVERE_FL == "Y"),
    mean_study_day = round(mean(AESTDY), 1),
    .groups = "drop"
  )

cat("\nClinical summary:\n")
print(clinical_summary)

# ===========================
# EXERCISE 7: Pivot Practice with Vital Signs
# ===========================

# 7a. Summarise vital signs by treatment (long format: one row per
# treatment arm and vital sign test)
vs_summary <- vs %>%
  # Only ARMCD is needed from DM, so select it before joining
  left_join(dm %>% select(USUBJID, ARMCD), by = "USUBJID") %>%
  group_by(ARMCD, VSTESTCD) %>%
  summarise(mean_val = round(mean(VSSTRESN), 1), .groups = "drop")

cat("\nVS summary (long format):\n")
print(vs_summary)

# 7b. Pivot to wide format for reporting
# One row per treatment arm, one column per vital sign test
vs_wide <- vs_summary %>%
  pivot_wider(names_from = VSTESTCD, values_from = mean_val)

cat("\nVS summary (wide format):\n")
print(vs_wide)

# ===========================
# EXERCISE 8: Missing Data Analysis
# ===========================

# 8a. Identify subjects in AE but not in DM
# anti_join() keeps the AE rows with no matching DM subject; distinct()
# collapses them to one row per subject.
missing_subjects <- ae %>%
  anti_join(dm, by = "USUBJID") %>%
  distinct(USUBJID)

cat("\nSubjects in AE but not in DM:\n")
print(missing_subjects)

# 8b. Create a data quality report
# For each domain: record count, distinct subjects, and how many of those
# subjects have no DM record (always 0 for DM itself).
data_quality <- tibble(
  Dataset = c("DM", "AE", "VS"),
  N_Records = c(nrow(dm), nrow(ae), nrow(vs)),
  N_Subjects = c(
    n_distinct(dm$USUBJID),
    n_distinct(ae$USUBJID),
    n_distinct(vs$USUBJID)
  ),
  N_Not_In_DM = c(
    0L,
    sum(!(unique(ae$USUBJID) %in% dm$USUBJID)),
    sum(!(unique(vs$USUBJID) %in% dm$USUBJID))
  )
)

cat("\nData quality report:\n")
print(data_quality)

# ===========================
# EXERCISE COMPLETE!
# ===========================

# Closing banner: recap of the topics covered, written in a single call.
# sep = "" stops cat() from inserting spaces between the pieces.
cat(
  "\n🎉 Module 3 Exercise Complete!\n",
  "You practiced:\n",
  "- Tidy data principles with pivot_longer() and pivot_wider()\n",
  "- All types of joins: left, inner, semi, anti\n",
  "- Group by and summarise operations\n",
  "- Complex clinical data analysis workflows\n",
  "- Data quality checks and missing data analysis\n",
  "\nGreat job! Ready for Module 4!\n",
  sep = ""
)
